{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
    "\n",
    "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tokenizers import AddedToken\n",
    "\n",
    "tokenizer.add_special_tokens({\"additional_special_tokens\": [AddedToken('\\n'), AddedToken('\\t'),\n",
    "                                                           AddedToken(' ')]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/transformers/utils/hub.py:651: UserWarning: The `organization` argument is deprecated and will be removed in v5 of Transformers. Set your organization directly in the `repo_id` passed instead (`repo_id={organization}/{model_id}`).\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/translation-t5-base-standard-bahasa-cased/commit/45f0bafc8c490fb2d1b44bd2eced30aec0c191d4', commit_message='Upload tokenizer', commit_description='', oid='45f0bafc8c490fb2d1b44bd2eced30aec0c191d4', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('translation-t5-base-standard-bahasa-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_special_ids = [0, 1, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['finetune-t5-base-standard-bahasa-cased/checkpoint-3640000',\n",
       " 'finetune-t5-base-standard-bahasa-cased/checkpoint-3650000',\n",
       " 'finetune-t5-base-standard-bahasa-cased/checkpoint-3660000']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "checkpoints = sorted(glob('finetune-t5-base-standard-bahasa-cased/checkpoint-*'))\n",
    "checkpoints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hey, ada sesiapa boleh saya bantu?\n"
     ]
    }
   ],
   "source": [
    "s = 'Hai, ada yang bisa saya bantu?'\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "haa ni ada yang bole tolong x\n"
     ]
    }
   ],
   "source": [
    "s = 'Hai, ada yang bisa saya bantu?'\n",
    "input_ids = tokenizer.encode(f'terjemah ke pasar Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello, could you help me?\n"
     ]
    }
   ],
   "source": [
    "s = 'Hai, ada yang bisa saya bantu?'\n",
    "input_ids = tokenizer.encode(f'terjemah ke Inggeris: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100, )\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Maaf, saya tidak dapat memahami permintaan anda. Bolehkah anda memberikan lebih banyak maklumat atau konteks tentang apa yang anda maksudkan dengan \"Pegangan Pertanian\"? Saya akan cuba membantu sebaik mungkin selepas saya mendapat maklumat yang lebih jelas.\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "Maaf, saya tidak dapat memahami permintaan Anda. Bisakah Anda memberikan lebih banyak informasi atau konteks tentang apa yang Anda maksud dengan \"Pegangan Pertanian\"? Saya akan mencoba membantu sebaik mungkin setelah saya mendapatkan informasi yang lebih jelas.\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " \n",
      "Maaf, saya tak dapat nak faham permintaan awak. Boleh tak bagi lebih banyak maklumat atau konteks tentang apa yang awak maksudkan dengan \"Pegangan Pertanian\"? Saya akan cuba bantu sebaik mungkin setelah saya mendapat maklumat yang lebih jelas.\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "Maaf, saya tidak dapat memahami permintaan Anda. Bisakah Anda memberikan lebih banyak informasi atau konteks tentang apa yang Anda maksud dengan \"Pegangan Pertanian\"? Saya akan mencoba membantu sebaik mungkin setelah saya mendapatkan informasi yang lebih jelas.\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke pasar Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<p>Sorry, I cannot understand your request. Could you please provide more information or context about what you mean by \"Agricultural Hold\"? I will try to help as best as I can after I get more clear information.</p>\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "Maaf, saya tidak dapat memahami permintaan Anda. Bisakah Anda memberikan lebih banyak informasi atau konteks tentang apa yang Anda maksud dengan \"Pegangan Pertanian\"? Saya akan mencoba membantu sebaik mungkin setelah saya mendapatkan informasi yang lebih jelas.\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Inggeris: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hi guys! I noticed yesterday and today many have received these cookies. So today I want to share some post mortems of our first batch:\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'Maaf, saya tidak dapat memberikan maklumat tentang orang tertentu tanpa maklumat lanjut. Bolehkah anda memberikan maklumat lanjut tentang Husein?'\n",
      "CPU times: user 22.2 s, sys: 32.1 ms, total: 22.3 s\n",
      "Wall time: 1.88 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "s = \"\"\"\n",
    "'Maaf, saya tidak dapat memberikan informasi tentang orang yang spesifik tanpa informasi lebih lanjut. Bisakah Anda memberikan informasi lebih lanjut tentang Husein?'\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'sorry takleh bagi maklumat org yang specific tanpa maklumat lanjut. boleh bagi maklumat lanjut hsein ni' \n",
      "CPU times: user 23.7 s, sys: 65 ms, total: 23.8 s\n",
      "Wall time: 1.99 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "s = \"\"\"\n",
    "'Maaf, saya tidak dapat memberikan informasi tentang orang yang spesifik tanpa informasi lebih lanjut. Bisakah Anda memberikan informasi lebih lanjut tentang Husein?'\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke pasar Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sorry I can't provide details about specific people without further information. Could you please provide more details about Husein?\n",
      "CPU times: user 23.2 s, sys: 16 ms, total: 23.2 s\n",
      "Wall time: 1.94 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "s = \"\"\"\n",
    "'Maaf, saya tidak dapat memberikan informasi tentang orang yang spesifik tanpa informasi lebih lanjut. Bisakah Anda memberikan informasi lebih lanjut tentang Husein?'\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Manglish: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Untuk menggunakan numpy, pertama-tama, anda perlu memasangnya melalui pip. Anda boleh melakukannya dengan menjalankan perintah `pip install numpy` di terminal anda.\n",
      "Setelah numpy dipasang, anda boleh mengimport modul numpy dengan menambahkan baris `import numpy as np` di awal program anda.\n",
      "Berikut adalah contoh beberapa operasi asas numpy:\n",
      "``` python\n",
      "import numpy as np\n",
      "# membuat array numpy dari senarai\n",
      "my_list = [1, 2, 3, 4, 5]\n",
      "my_array = np.array(my_list)\n",
      "# membuat array numpy dengan julat nilai tertentu\n",
      "my_range = np.arange(0, 10, 2) # nilai awal, nilai akhir, dan lompatan\n",
      "# membuat array numpy dengan nilai rawak\n",
      "my_random_array = np.random.rand(3, 3) # 3 baris dan 3 lajur\n",
      "# mengakses elemen array numpy\n",
      "print(my_array[0]) # mengakses elemen pertama\n",
      "# melakukan operasi matematik pada array numpy\n",
      "my_array = my_array + 1 # menambahkan setiap elemen dengan 1\n",
      "my_array = my_array * 2 # mendarab setiap elemen dengan 2\n",
      "# mengubah bentuk array numpy\n",
      "my_array = np.reshape(my_array, (2, 5)) # menjadi array 2D dengan 2 baris dan 5 lajur\n",
      "```\n",
      "Itulah beberapa operasi asas numpy. Anda boleh mencari dokumentasi rasmi numpy di https://numpy.org/doc/stable/.\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "`Untuk menggunakan numpy, pertama-tama, Anda perlu menginstalnya melalui pip. Anda dapat melakukannya dengan menjalankan perintah `pip install numpy` di terminal Anda.\n",
    "Setelah numpy terinstal, Anda dapat mengimpor modul numpy dengan menambahkan baris `import numpy as np` di awal program Anda.\n",
    "Berikut adalah contoh beberapa operasi dasar numpy:\n",
    "``` python\n",
    "import numpy as np\n",
    "# membuat array numpy dari list\n",
    "my_list = [1, 2, 3, 4, 5]\n",
    "my_array = np.array(my_list)\n",
    "# membuat array numpy dengan rentang nilai tertentu\n",
    "my_range = np.arange(0, 10, 2) # nilai awal, nilai akhir, dan loncatan\n",
    "# membuat array numpy dengan nilai acak\n",
    "my_random_array = np.random.rand(3, 3) # 3 baris dan 3 kolom\n",
    "# mengakses elemen array numpy\n",
    "print(my_array[0]) # mengakses elemen pertama\n",
    "# melakukan operasi matematika pada array numpy\n",
    "my_array = my_array + 1 # menambah setiap elemen dengan 1\n",
    "my_array = my_array * 2 # mengalikan setiap elemen dengan 2\n",
    "# mengubah bentuk array numpy\n",
    "my_array = np.reshape(my_array, (2, 5)) # menjadi array 2D dengan 2 baris dan 5 kolom\n",
    "```\n",
    "Itulah beberapa operasi dasar numpy. Anda dapat menemukan dokumentasi resmi numpy di https://numpy.org/doc/stable/.\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 1024)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dari malaya.supervised import huggingface sebagai load_huggingface\n",
      "dari malaya.function import describe_availability, check_file\n",
      "dari malaya.path import PATH_PREPROCESSING, S3_PATH_PREPROCESSING\n",
      "import json\n",
      "import logging\n",
      "\n",
      "\n",
      "\n",
      "adakah anda fikir import yang betul?\n"
     ]
    }
   ],
   "source": [
    "code = \"\"\"\n",
    "from malaya.supervised import huggingface as load_huggingface\n",
    "from malaya.function import describe_availability, check_file\n",
    "from malaya.path import PATH_PREPROCESSING, S3_PATH_PREPROCESSING\n",
    "import json\n",
    "import logging\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "is think correct import?\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {code}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 256)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model = AutoModelForSeq2SeqLM.from_pretrained(\n",
      "        model_args.model_name_or_path,\n",
      "        from_tf=bool(\".ckpt\" dalam model_args.model_name_or_path),\n",
      "        config=config,\n",
      "        cache_dir=model_args.cache_dir,\n",
      "        revision=model_args.model_revision,\n",
      "        use_auth_token=True if model_args.use_auth_token else None,\n",
      "    )\n",
      "    model.config.use_cache = False\n",
      "\n",
      "    model.resize_token_embeddings(len(tokenizer))\n",
      "    \n",
      "<head>\n",
      "    <meta charset=\"\n"
     ]
    }
   ],
   "source": [
    "code = \"\"\"\n",
    "    model = AutoModelForSeq2SeqLM.from_pretrained(\n",
    "        model_args.model_name_or_path,\n",
    "        from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n",
    "        config=config,\n",
    "        cache_dir=model_args.cache_dir,\n",
    "        revision=model_args.model_revision,\n",
    "        use_auth_token=True if model_args.use_auth_token else None,\n",
    "    )\n",
    "    model.config.use_cache = False\n",
    "\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "    \n",
    "<head>\n",
    "    <meta charset=\"utf-8\">\n",
    "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n",
    "<meta name=\"description\" content=\"Get started with a free and open-source admin dashboard layout built with Tailwind CSS and Flowbite featuring charts, widgets, CRUD layouts, authentication pages, and more\">\n",
    "<meta name=\"author\" content=\"Themesberg\">\n",
    "<meta name=\"generator\" content=\"Hugo 0.111.2\">\n",
    "\n",
    "tak suka ayam\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Melayu: {code}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 256)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26 DR.27.10.2021 \n",
      "\n",
      " \n",
      "MOTION TO ADJOURN THE MEETING \n",
      "\n",
      "UNDER P.M. 18(1) \n",
      " \n",
      "\n",
      "PETRONAS ASET ASET ASSISTANCE IN AZERBAIJAN \n",
      "\n",
      " \n",
      "\n",
      "11.33 a.m. \n",
      "\n",
      "Dato Seri Anwar bin Ibrahim [Port Dickson]: Mr. Speaker, I would like to submit a motion under Meeting Rules 18(1) and 18(2) Rules- the rules of the House of Representatives Council as follows: \n",
      "\n",
      "\"That the House that convenes today negotiate a report that Petronas has sold gas assets in Azerbaijan at a price of almost RM10 billion following the government's insistence to raise Petronas' dividend payment of RM25 billion to \n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "\\n26 DR.27.10.2021 \\n\\n \\nUSUL MENANGGUHKAN MESYUARAT  \\n\\nDI BAWAH P.M. 18(1) \\n \\n\\nPENJUALAN ASET GAS PETRONAS DI AZERBAIJAN \\n\\n \\n\\n11.33 pg. \\n\\nDato’ Seri Anwar bin Ibrahim [Port Dickson]: Tuan Yang di-Pertua, saya \\nmohon mengemukakan usul di bawah Peraturan Mesyuarat 18(1) dan 18(2) Peraturan-\\nperaturan Majlis Mesyuarat Dewan Rakyat seperti berikut: \\n\\n“Bahawa Dewan yang bersidang pada hari ini merundingkan \\nlaporan bahawa Petronas telah menjual aset gas di Azerbaijan dengan \\nharga hampir RM10 bilion berikutan desakan kerajaan menaikkan \\nbayaran dividen Petronas sebanyak RM25 bilion kepada kerajaan bagi \\ntahun 2021.” \\n\\n \\nPerkara ini adalah perkara tertentu kerana penjualan aset gas di Azerbaijan \\n\\nbernilai RM10 bilion dan juga pembayaran dividen kepada kerajaan sebanyak RM25 bilion \\nbagi tahun 2021 adalah berbahaya dan menjejaskan masa depan Petronas. Terkait \\ndengan― sedikit penjelasan.  Kerana tindakan seperti ini telah menyebabkan J.P. Morgan \\nmengeluarkan Petronas dari ESG Emerging Market Bond Index dan ESG Asia Credit \\nIndex.  \\n\\nOleh itu, perkara itu perlu disegerakan kerana Petronas mengalami defisit \\nberikutan pembayaran dividen berlebihan sejak tahun 2018 sehingga kini yang memaksa \\nPetronas menjual aset atau meminjam lebih banyak dana luar negara bagi menampung \\naliran tunai.   \\n\\nIni diperkukuh dengan kenyataan Tengku Muhammad Taufik, CEO Petronas, “If \\nthere is a policy shift but it comes too suddenly or too slowly, not only Petronas but the \\nentire Malaysian economy could be put at risk, with regards to our energy mix.” \\n\\nIni perkara berkenaan dengan kepentingan orang ramai, malah seluruh ekonomi \\nnegara, kerana Petronas adalah institusi penting ekonomi negara dan sewajarnya \\ndiperkukuh kekuatannya dan tidak dilemahkan penarafan syarikat. Kemampuan untuk \\nmelabur bagi menjana keuntungan masa depan yang mampu untuk membayar dividen \\nyang munasabah kepada kerajaan demi kepentingan rakyat. Dengan tambahan RM7 \\nbilion tahun ini, hasil Petronas akan berjumlah RM44.8 bilion atau 18 peratus daripada \\npendapatan kerajaan bagi tahun 2021 daripada RM37.8 bilion sebelumnya. \\n\\nJadi, mohon persetujuan Tuan Yang di-Pertua untuk dibahaskan. [Tepuk] \\n\\nTuan Yang di-Pertua: Terima kasih Yang Berhormat. Sebenarnya Yang \\nBerhormat perlu melengkapkan teks Yang Berhormat dan cuma dibaca teks tersebut \\nmengikut peraturan. Akan tetapi, tak apa.  \\n\\nAhli-ahli Yang Berhormat, saya telah menerima satu pemberitahu usul di bawah \\nPeraturan Mesyuarat 18(1) oleh Yang Berhormat Port Dickson, Ketua Pembangkang pada \\nhari Isnin, 25 Oktober 2021. Teks usul itu adalah seperti yang dibacakan tadi dengan \\nsedikit tambahan oleh Ahli Yang Berhormat Port Dickson sebentar tadi.  \\n\\nBagi membolehkan perkara ini ditimbangkan oleh Majlis Mesyuarat, saya \\nhendaklah berpuas hati bahawa perkara yang dibangkitkan oleh Yang Berhormat itu \\nmematuhi tiga syarat seperti mana biasalah iaitu: \\n\\n(i) perkara tertentu; \\n\\n(ii) bagi kepentingan orang ramai; dan \\n\\n(iii) berkehendak disegerakan. \\n\\n\\n\n",
    "\"\"\"\n",
    "input_ids = tokenizer.encode(f'terjemah ke Inggeris: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 256)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hi guys! Saya perasan semalam dan hari ini ramai yang telah menerima kuki ini. Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hi guys! I noticed semalam &amp; harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch: https://t.co/H9H9H9H9HH\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke pasar Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hi guys! I noticed yesterday & today a lot of people get this cookies right. So today I want to share some post mortem of our first batch:\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Manglish: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Can I make an ASR?\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Inggeris: mesolitica boleh buat asr tak', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bolehkah saya membuat ASR?\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Melayu: mesolitica boleh buat asr tak', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saya menulis untuk memohon jawatan Senior Software Engineer di [Syarikat]. Sebagai seorang jurutera perisian yang berkemahiran tinggi dan berpengalaman dengan pengkhususan dalam senibina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan yang berharga kepada pasukan anda\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saya menulis untuk memohon jawatan Jurutera Perisian Kanan di [Syarikat]. Sebagai jurutera perisian yang berkemahiran tinggi dan berpengalaman dengan pengkhususan dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan yang berharga kepada pasukan anda\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "So aku hisap mak aku yang aku sayang malam tu jugak. Sedangkan aku makan makanan barat tapi.. aku tak rasa makanan tu sedap sebab aku fikir nak settle benda yang ada dalam otak aku ni\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(\"terjemah ke pasar Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Jadi saya merokok ibu saya yang saya sayangi malam itu juga. Sementara itu, saya sedang makan makanan barat tetapi.. saya tidak berpikir makanan itu baik karena saya sedang memikirkan untuk menyelesaikan hal-hal yang ada di otak saya\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(\"terjemah ke Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "so i smoked my mom who i love that night too while i was eating the western food but i didn t think the food was good because i was thinking about settling the things that are in my brain\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(\"terjemah ke Manglish: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(\"terjemah ke Inggeris: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs,spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Do you have a promotion?\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Inggeris: ada promo x?', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs,spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ada promosi tak?\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Melayu: ada promo x?', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs,spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "got promo x?\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode('terjemah ke Manglish: ada promo x?', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs,spaces_between_special_tokens = False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"I don't understand\",\n",
       " 'Hi guys! I noticed yesterday and today many have received these cookies. So today I want to share some post mortems of our first batch:',\n",
       " \"Indeed. This is not a need for an expert, I know. It's a gesture, stupid.\",\n",
       " \"At 8 o'clock at the KK market, there were many people, he was good at choosing a place.\",\n",
       " \"So it's forbidden to be a bastard\",\n",
       " 'Where are you going?',\n",
       " 'I want to take half a day',\n",
       " \"Imagine PH and winning GE-14. Then there were all sorts of backdoors. In the end, Ismail Sabri rose. That's why I don't care about politics anymore. I swear I've made a mistake.\"]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "strings = [\n",
    "    'ak tak paham la',\n",
    "    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n",
    "    \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n",
    "    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n",
    "    'Jadi haram jadah😀😃🤭',\n",
    "    'nak gi mana tuu',\n",
    "    'Macam nak ambil half day',\n",
    "    \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n",
    "]\n",
    "input_ids = [{'input_ids': tokenizer.encode(f'terjemah ke Inggeris: {s}', return_tensors='pt')[\n",
    "    0]} for s in strings]\n",
    "padded = tokenizer.pad(input_ids, padding='longest')\n",
    "outputs = model.generate(**padded, max_length = 100)\n",
    "tokenizer.batch_decode([[i for i in o if i not in all_special_ids] for o in outputs], \n",
    "                       spaces_between_special_tokens = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Saya tidak faham',\n",
       " 'Hi guys! Saya perasan semalam dan hari ini ramai yang telah menerima kuki ini. Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:',\n",
       " 'Memanglah. Ini tak perlu pakar, saya juga tahu. Ini adalah isyarat, bodoh.',\n",
       " 'Pada jam 8 pagi di pasar KK, ramai orang memilih tempat.',\n",
       " 'Jadi haram jadah',\n",
       " 'Ke mana kamu pergi?',\n",
       " 'Bagaimana untuk mengambil separuh hari?',\n",
       " 'Bayangkan PH dan menang PRU-14. Kemudian ada pelbagai pintu belakang. Akhirnya, Ismail Sabri naik. Itulah sebabnya saya tidak lagi peduli tentang politik. Saya bersumpah saya telah membuat kesilapan.']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "strings = [\n",
    "    'ak tak paham la',\n",
    "    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n",
    "    \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n",
    "    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n",
    "    'Jadi haram jadah😀😃🤭',\n",
    "    'nak gi mana tuu',\n",
    "    'Macam nak ambil half day',\n",
    "    \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n",
    "]\n",
    "input_ids = [{'input_ids': tokenizer.encode(f'terjemah ke Melayu: {s}', return_tensors='pt')[\n",
    "    0]} for s in strings]\n",
    "padded = tokenizer.pad(input_ids, padding='longest')\n",
    "outputs = model.generate(**padded, max_length = 100)\n",
    "tokenizer.batch_decode([[i for i in o if i not in all_special_ids] for o in outputs], \n",
    "                       spaces_between_special_tokens = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/translation-t5-base-standard-bahasa-cased/commit/aa1dd41faa8cace67ce612bbe1d79ebd733180ad', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='aa1dd41faa8cace67ce612bbe1d79ebd733180ad', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.push_to_hub('translation-t5-base-standard-bahasa-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/translation-t5-base-standard-bahasa-cased/commit/25632d7212709837ab7b56c879ef8f42ee48b292', commit_message='Upload tokenizer', commit_description='', oid='25632d7212709837ab7b56c879ef8f42ee48b292', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('translation-t5-base-standard-bahasa-cased', organization='mesolitica')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
